Imputation
source("gamlss_mice.R", local = knitr::knit_global())

##
## Variables sorted by number of missings:
## Variable Count
## hgt 0.2009612
## sex 0.2008611
## age 0.1974567
## reg 0.1968559
## wgt 0.1935516
##
##
## iter imp variable
## 1 1 wgt reg age sex hgt
## 1 2 wgt reg age sex hgt
NA statistics in the missing dataset
# Summarise the missing values of wgt with plot_na_pie() (defined outside this
# chunk, presumably in gamlss_mice.R -- confirm). The return value is kept
# because it is passed as `nas` to the create_compare_data() calls below.
# NOTE(review): the printed value looks like the NA count of the column (it is
# consistent with the missing fractions reported above) -- confirm.
wgt_nas <- plot_na_pie("wgt")

## [1] 1933

# Same summary for hgt; hgt_nas feeds the height comparisons below.
hgt_nas <- plot_na_pie("hgt")

## [1] 2007

# Same summary for age.
# NOTE(review): age_nas is not referenced in the visible part of the script.
age_nas <- plot_na_pie("age")

## [1] 1972

RF: Weight
rf:compare the imputed datasets with the original dataset
# Build the comparison frame for wgt from the random-forest imputations.
# create_compare_data() is defined outside this chunk; the plots below rely on
# its result having the columns age, wgt, source and sex.
df_rf_wgt <- create_compare_data(
  data, miss_data, impt_mice_rf_data,
  nas = wgt_nas,
  col = "wgt",
  method = "rf",
  sp_impt = "method"
)

# wgt against age, one colour and one smoother per source.
ggplot(df_rf_wgt, aes(x = age, y = wgt, colour = source)) +
  geom_point(alpha = 0.4) +
  stat_smooth()

# Distribution of wgt per source.
ggplot(df_rf_wgt, aes(x = source, y = wgt, colour = source)) +
  geom_boxplot()

# Same boxplots, but the layer-level aesthetic recolours the boxes by sex.
ggplot(df_rf_wgt, aes(x = source, y = wgt, colour = source)) +
  geom_boxplot(aes(colour = sex))

RF:compare split with Sex
# Rebuild df_rf_wgt with sp_impt = "sex"; this overwrites any earlier
# df_rf_wgt, so later plots of df_rf_wgt use this version.
df_rf_wgt <- create_compare_data(
  data, miss_data, impt_mice_rf_data,
  nas = wgt_nas,
  col = "wgt",
  method = "rf",
  sp_impt = "sex"
)

# wgt against age, one colour and one smoother per source.
ggplot(df_rf_wgt, aes(x = age, y = wgt, colour = source)) +
  geom_point(alpha = 0.4) +
  stat_smooth()

# Distribution of wgt per source.
ggplot(df_rf_wgt, aes(x = source, y = wgt, colour = source)) +
  geom_boxplot()

RF:compare by NA counts
# These plots use df_rf_wgt as most recently assigned and its na_count column.

# wgt against age, coloured (and smoothed) per na_count group.
ggplot(df_rf_wgt, aes(x = age, y = wgt, colour = na_count)) +
  geom_point(alpha = 0.4) +
  stat_smooth()

# wgt per na_count group, boxes split by sex.
ggplot(df_rf_wgt, aes(x = na_count, y = wgt, colour = sex)) +
  geom_boxplot()

# Keep only the rows whose na_count label matches "4:" or "True".
ggplot(
  df_rf_wgt[grepl("4:|True", df_rf_wgt$na_count), ],
  aes(x = age, y = wgt, colour = na_count)
) +
  geom_point(alpha = 0.4) +
  stat_smooth()

GAMLSS: Weight
GAMLSS:compare the imputed datasets with the original dataset
# Build the comparison frame for wgt from the GAMLSS imputations.
# create_compare_data() is defined outside this chunk; the plots below rely on
# its result having the columns age, wgt, source and sex.
# NOTE(review): method = "cart" is passed although the data are
# impt_mice_gamlss_data; the RF and CART sections pass a method that matches
# their data. Confirm whether "gamlss" was intended.
df_gamlss_wgt <- create_compare_data(
  data, miss_data, impt_mice_gamlss_data,
  nas = wgt_nas,
  col = "wgt",
  method = "cart",
  sp_impt = "method"
)

# wgt against age, one colour and one smoother per source.
ggplot(df_gamlss_wgt, aes(x = age, y = wgt, colour = source)) +
  geom_point(alpha = 0.4) +
  stat_smooth()

# Distribution of wgt per source.
ggplot(df_gamlss_wgt, aes(x = source, y = wgt, colour = source)) +
  geom_boxplot()

# Same boxplots, but the layer-level aesthetic recolours the boxes by sex.
ggplot(df_gamlss_wgt, aes(x = source, y = wgt, colour = source)) +
  geom_boxplot(aes(colour = sex))

GAMLSS:compare split with Sex
# Rebuild df_gamlss_wgt with sp_impt = "sex"; this overwrites any earlier
# df_gamlss_wgt, so later plots of df_gamlss_wgt use this version.
# NOTE(review): method = "cart" is passed with the GAMLSS imputations --
# confirm whether "gamlss" was intended.
df_gamlss_wgt <- create_compare_data(
  data, miss_data, impt_mice_gamlss_data,
  nas = wgt_nas,
  col = "wgt",
  method = "cart",
  sp_impt = "sex"
)

# wgt against age, one colour and one smoother per source.
ggplot(df_gamlss_wgt, aes(x = age, y = wgt, colour = source)) +
  geom_point(alpha = 0.4) +
  stat_smooth()

# Distribution of wgt per source.
ggplot(df_gamlss_wgt, aes(x = source, y = wgt, colour = source)) +
  geom_boxplot()

GAMLSS:compare by NA counts
# These plots use df_gamlss_wgt as most recently assigned and its na_count
# column.

# wgt against age, coloured (and smoothed) per na_count group.
ggplot(df_gamlss_wgt, aes(x = age, y = wgt, colour = na_count)) +
  geom_point(alpha = 0.4) +
  stat_smooth()

# wgt per na_count group, boxes split by sex.
ggplot(df_gamlss_wgt, aes(x = na_count, y = wgt, colour = sex)) +
  geom_boxplot()

# Keep only the rows whose na_count label matches "4:" or "True".
ggplot(
  df_gamlss_wgt[grepl("4:|True", df_gamlss_wgt$na_count), ],
  aes(x = age, y = wgt, colour = na_count)
) +
  geom_point(alpha = 0.4) +
  stat_smooth()

CART: Weight
CART:compare the imputed datasets with the original dataset
# Build the comparison frame for wgt from the CART imputations.
# create_compare_data() is defined outside this chunk; the plots below rely on
# its result having the columns age, wgt, source and sex.
df_cart_wgt <- create_compare_data(
  data, miss_data, impt_mice_cart_data,
  nas = wgt_nas,
  col = "wgt",
  method = "cart",
  sp_impt = "method"
)

# wgt against age, one colour and one smoother per source.
ggplot(df_cart_wgt, aes(x = age, y = wgt, colour = source)) +
  geom_point(alpha = 0.4) +
  stat_smooth()

# Distribution of wgt per source.
ggplot(df_cart_wgt, aes(x = source, y = wgt, colour = source)) +
  geom_boxplot()

# Same boxplots, but the layer-level aesthetic recolours the boxes by sex.
ggplot(df_cart_wgt, aes(x = source, y = wgt, colour = source)) +
  geom_boxplot(aes(colour = sex))

CART:compare split with Sex
# Rebuild df_cart_wgt with sp_impt = "sex"; this overwrites any earlier
# df_cart_wgt, so later plots of df_cart_wgt use this version.
df_cart_wgt <- create_compare_data(
  data, miss_data, impt_mice_cart_data,
  nas = wgt_nas,
  col = "wgt",
  method = "cart",
  sp_impt = "sex"
)

# wgt against age, one colour and one smoother per source.
ggplot(df_cart_wgt, aes(x = age, y = wgt, colour = source)) +
  geom_point(alpha = 0.4) +
  stat_smooth()

# Distribution of wgt per source.
ggplot(df_cart_wgt, aes(x = source, y = wgt, colour = source)) +
  geom_boxplot()

CART:compare by NA counts
# These plots use df_cart_wgt as most recently assigned and its na_count
# column.

# wgt against age, coloured (and smoothed) per na_count group.
ggplot(df_cart_wgt, aes(x = age, y = wgt, colour = na_count)) +
  geom_point(alpha = 0.4) +
  stat_smooth()

# wgt per na_count group, boxes split by sex.
ggplot(df_cart_wgt, aes(x = na_count, y = wgt, colour = sex)) +
  geom_boxplot()

# Keep only the rows whose na_count label matches "4:" or "True".
ggplot(
  df_cart_wgt[grepl("4:|True", df_cart_wgt$na_count), ],
  aes(x = age, y = wgt, colour = na_count)
) +
  geom_point(alpha = 0.4) +
  stat_smooth()

RF: Height
rf:compare the imputed datasets with the original dataset
# Build the comparison frame for hgt from the random-forest imputations.
# create_compare_data() is defined outside this chunk; the plots below rely on
# its result having the columns age, hgt, source and sex.
df_rf_hgt <- create_compare_data(
  data, miss_data, impt_mice_rf_data,
  nas = hgt_nas,
  col = "hgt",
  method = "rf",
  sp_impt = "method"
)

# hgt against age, one colour and one smoother per source.
ggplot(df_rf_hgt, aes(x = age, y = hgt, colour = source)) +
  geom_point(alpha = 0.4) +
  stat_smooth()

# Distribution of hgt per source.
ggplot(df_rf_hgt, aes(x = source, y = hgt, colour = source)) +
  geom_boxplot()

# Same boxplots, but the layer-level aesthetic recolours the boxes by sex.
ggplot(df_rf_hgt, aes(x = source, y = hgt, colour = source)) +
  geom_boxplot(aes(colour = sex))

RF:compare split with Sex
# Rebuild df_rf_hgt with sp_impt = "sex"; this overwrites any earlier
# df_rf_hgt, so later plots of df_rf_hgt use this version.
df_rf_hgt <- create_compare_data(
  data, miss_data, impt_mice_rf_data,
  nas = hgt_nas,
  col = "hgt",
  method = "rf",
  sp_impt = "sex"
)

# hgt against age, one colour and one smoother per source.
ggplot(df_rf_hgt, aes(x = age, y = hgt, colour = source)) +
  geom_point(alpha = 0.4) +
  stat_smooth()

# Distribution of hgt per source.
ggplot(df_rf_hgt, aes(x = source, y = hgt, colour = source)) +
  geom_boxplot()

RF:compare by NA counts
# These plots use df_rf_hgt as most recently assigned and its na_count column.

# hgt against age, coloured (and smoothed) per na_count group.
ggplot(df_rf_hgt, aes(x = age, y = hgt, colour = na_count)) +
  geom_point(alpha = 0.4) +
  stat_smooth()

# hgt per na_count group, boxes split by sex.
ggplot(df_rf_hgt, aes(x = na_count, y = hgt, colour = sex)) +
  geom_boxplot()

# Keep only the rows whose na_count label matches "4:" or "True".
ggplot(
  df_rf_hgt[grepl("4:|True", df_rf_hgt$na_count), ],
  aes(x = age, y = hgt, colour = na_count)
) +
  geom_point(alpha = 0.4) +
  stat_smooth()

GAMLSS: Height
GAMLSS:compare the imputed datasets with the original dataset
# Build the comparison frame for hgt from the GAMLSS imputations.
# create_compare_data() is defined outside this chunk; the plots below rely on
# its result having the columns age, hgt, source and sex.
# NOTE(review): method = "cart" is passed although the data are
# impt_mice_gamlss_data; the RF and CART sections pass a method that matches
# their data. Confirm whether "gamlss" was intended.
df_gamlss_hgt <- create_compare_data(
  data, miss_data, impt_mice_gamlss_data,
  nas = hgt_nas,
  col = "hgt",
  method = "cart",
  sp_impt = "method"
)

# hgt against age, one colour and one smoother per source.
ggplot(df_gamlss_hgt, aes(x = age, y = hgt, colour = source)) +
  geom_point(alpha = 0.4) +
  stat_smooth()

# Distribution of hgt per source.
ggplot(df_gamlss_hgt, aes(x = source, y = hgt, colour = source)) +
  geom_boxplot()

# Same boxplots, but the layer-level aesthetic recolours the boxes by sex.
ggplot(df_gamlss_hgt, aes(x = source, y = hgt, colour = source)) +
  geom_boxplot(aes(colour = sex))

GAMLSS:compare split with Sex
# Rebuild df_gamlss_hgt with sp_impt = "sex"; this overwrites any earlier
# df_gamlss_hgt, so later plots of df_gamlss_hgt use this version.
# NOTE(review): method = "cart" is passed with the GAMLSS imputations --
# confirm whether "gamlss" was intended.
df_gamlss_hgt <- create_compare_data(
  data, miss_data, impt_mice_gamlss_data,
  nas = hgt_nas,
  col = "hgt",
  method = "cart",
  sp_impt = "sex"
)

# hgt against age, one colour and one smoother per source.
ggplot(df_gamlss_hgt, aes(x = age, y = hgt, colour = source)) +
  geom_point(alpha = 0.4) +
  stat_smooth()

# Distribution of hgt per source.
ggplot(df_gamlss_hgt, aes(x = source, y = hgt, colour = source)) +
  geom_boxplot()

GAMLSS:compare by NA counts
# These plots use df_gamlss_hgt as most recently assigned and its na_count
# column.

# hgt against age, coloured (and smoothed) per na_count group.
ggplot(df_gamlss_hgt, aes(x = age, y = hgt, colour = na_count)) +
  geom_point(alpha = 0.4) +
  stat_smooth()

# hgt per na_count group, boxes split by sex.
ggplot(df_gamlss_hgt, aes(x = na_count, y = hgt, colour = sex)) +
  geom_boxplot()

# Keep only the rows whose na_count label matches "4:" or "True".
ggplot(
  df_gamlss_hgt[grepl("4:|True", df_gamlss_hgt$na_count), ],
  aes(x = age, y = hgt, colour = na_count)
) +
  geom_point(alpha = 0.4) +
  stat_smooth()

CART: Height
CART:compare the imputed datasets with the original dataset
# Build the comparison frame for hgt from the CART imputations.
# create_compare_data() is defined outside this chunk; the plots below rely on
# its result having the columns age, hgt, source and sex.
df_cart_hgt <- create_compare_data(
  data, miss_data, impt_mice_cart_data,
  nas = hgt_nas,
  col = "hgt",
  method = "cart",
  sp_impt = "method"
)

# hgt against age, one colour and one smoother per source.
ggplot(df_cart_hgt, aes(x = age, y = hgt, colour = source)) +
  geom_point(alpha = 0.4) +
  stat_smooth()

# Distribution of hgt per source.
ggplot(df_cart_hgt, aes(x = source, y = hgt, colour = source)) +
  geom_boxplot()

# Same boxplots, but the layer-level aesthetic recolours the boxes by sex.
ggplot(df_cart_hgt, aes(x = source, y = hgt, colour = source)) +
  geom_boxplot(aes(colour = sex))

CART:compare split with Sex
# Rebuild df_cart_hgt with sp_impt = "sex"; this overwrites any earlier
# df_cart_hgt, so later plots of df_cart_hgt use this version.
df_cart_hgt <- create_compare_data(
  data, miss_data, impt_mice_cart_data,
  nas = hgt_nas,
  col = "hgt",
  method = "cart",
  sp_impt = "sex"
)

# hgt against age, one colour and one smoother per source.
ggplot(df_cart_hgt, aes(x = age, y = hgt, colour = source)) +
  geom_point(alpha = 0.4) +
  stat_smooth()

# Distribution of hgt per source.
ggplot(df_cart_hgt, aes(x = source, y = hgt, colour = source)) +
  geom_boxplot()

CART:compare by NA counts
# These plots use df_cart_hgt as most recently assigned and its na_count
# column.

# hgt against age, coloured (and smoothed) per na_count group.
ggplot(df_cart_hgt, aes(x = age, y = hgt, colour = na_count)) +
  geom_point(alpha = 0.4) +
  stat_smooth()

# hgt per na_count group, boxes split by sex.
ggplot(df_cart_hgt, aes(x = na_count, y = hgt, colour = sex)) +
  geom_boxplot()

# Keep only the rows whose na_count label matches "4:" or "True".
ggplot(
  df_cart_hgt[grepl("4:|True", df_cart_hgt$na_count), ],
  aes(x = age, y = hgt, colour = na_count)
) +
  geom_point(alpha = 0.4) +
  stat_smooth()

compare miss to true data:wgt
# Compare imputed wgt with the true wgt (from `data`) at the rows where wgt is
# NA in miss_data. One figure with three panels (rf / gamlss / cart) is drawn
# per imputed dataset.
miss_index <- which(is.na(miss_data$wgt))

# The colour grouping does not depend on the imputation, so compute it once.
sex <- factor(data$sex[miss_index])

# Plot at most 10 imputations (the original loop bound), but never index past
# the end of the shortest list of imputed datasets.
n_imp <- min(
  10L,
  length(impt_mice_rf_data),
  length(impt_mice_gamlss_data),
  length(impt_mice_cart_data)
)

for (i in seq_len(n_imp)) {
  # Index the i-th imputed dataset. The previous code always read element
  # [[3]], so every iteration redrew exactly the same figure.
  g1 <- qplot(
    data$wgt[miss_index], impt_mice_rf_data[[i]]$wgt[miss_index],
    col = sex
  ) +
    stat_smooth() +
    ylim(-10, 105) +
    ylab("rf wgt") +
    xlab("data wgt") +
    theme(legend.position = "top")
  g2 <- qplot(
    data$wgt[miss_index], impt_mice_gamlss_data[[i]]$wgt[miss_index],
    col = sex
  ) +
    stat_smooth() +
    ylim(-10, 105) +
    ylab("gamlss wgt") +
    xlab("data wgt") +
    theme(legend.position = "top")
  g3 <- qplot(
    data$wgt[miss_index], impt_mice_cart_data[[i]]$wgt[miss_index],
    col = sex
  ) +
    stat_smooth() +
    ylim(-10, 105) +
    ylab("cart wgt") +
    xlab("data wgt") +
    theme(legend.position = "top")
  # Place the three method panels side by side.
  grid.arrange(g1, g2, g3, ncol = 3)
}
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

compare miss to true data:hgt
# Compare imputed hgt with the true hgt (from `data`) at the rows where hgt is
# NA in miss_data. One figure with three panels (rf / gamlss / cart) is drawn
# per imputed dataset.
miss_index <- which(is.na(miss_data$hgt))

# The colour grouping does not depend on the imputation, so compute it once.
sex <- factor(data$sex[miss_index])

# Plot at most 10 imputations (the original loop bound), but never index past
# the end of the shortest list of imputed datasets.
n_imp <- min(
  10L,
  length(impt_mice_rf_data),
  length(impt_mice_gamlss_data),
  length(impt_mice_cart_data)
)

for (i in seq_len(n_imp)) {
  # Index the i-th imputed dataset. The previous code always read element
  # [[3]], so every iteration redrew exactly the same figure.
  g1 <- qplot(
    data$hgt[miss_index], impt_mice_rf_data[[i]]$hgt[miss_index],
    col = sex
  ) +
    stat_smooth() +
    ylim(30, 215) +
    ylab("rf hgt") +
    xlab("data hgt") +
    theme(legend.position = "top")
  g2 <- qplot(
    data$hgt[miss_index], impt_mice_gamlss_data[[i]]$hgt[miss_index],
    col = sex
  ) +
    stat_smooth() +
    ylim(30, 215) +
    ylab("gamlss hgt") +
    xlab("data hgt") +
    theme(legend.position = "top")
  g3 <- qplot(
    data$hgt[miss_index], impt_mice_cart_data[[i]]$hgt[miss_index],
    col = sex
  ) +
    stat_smooth() +
    ylim(30, 215) +
    ylab("cart hgt") +
    xlab("data hgt") +
    theme(legend.position = "top")
  # Place the three method panels side by side.
  grid.arrange(g1, g2, g3, ncol = 3)
}










compare miss to true data:age
# Compare imputed age with the true age (from `data`) at the rows where age is
# NA in miss_data. One figure with three panels (rf / gamlss / cart) is drawn
# per imputed dataset.
miss_index <- which(is.na(miss_data$age))

# The colour grouping does not depend on the imputation, so compute it once.
sex <- factor(data$sex[miss_index])

# Plot at most 10 imputations (the original loop bound), but never index past
# the end of the shortest list of imputed datasets.
n_imp <- min(
  10L,
  length(impt_mice_rf_data),
  length(impt_mice_gamlss_data),
  length(impt_mice_cart_data)
)

for (i in seq_len(n_imp)) {
  # Index the i-th imputed dataset. The previous code always read element
  # [[3]], so every iteration redrew exactly the same figure.
  g1 <- qplot(
    data$age[miss_index], impt_mice_rf_data[[i]]$age[miss_index],
    col = sex
  ) +
    stat_smooth() +
    ylim(-5, 22) +
    ylab("rf age") +
    xlab("data age") +
    theme(legend.position = "top")
  g2 <- qplot(
    data$age[miss_index], impt_mice_gamlss_data[[i]]$age[miss_index],
    col = sex
  ) +
    stat_smooth() +
    ylim(-5, 22) +
    ylab("gamlss age") +
    xlab("data age") +
    theme(legend.position = "top")
  g3 <- qplot(
    data$age[miss_index], impt_mice_cart_data[[i]]$age[miss_index],
    col = sex
  ) +
    stat_smooth() +
    ylim(-5, 22) +
    ylab("cart age") +
    xlab("data age") +
    theme(legend.position = "top")
  # Place the three method panels side by side.
  grid.arrange(g1, g2, g3, ncol = 3)
}









